## libraries
# library(confintr)
# library(rstan)
# library(rstanarm)
library(glue)
library(fs)
# library(vroom)
library(broom)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(scales)
library(ggplot2)
# library(patchwork)
# library(reactable)
library(workflows)
library(workflowsets)
library(rsample)
library(themis)
## Loading required package: recipes
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
# library(recipes)
library(parsnip)
library(finetune)
## Loading required package: tune
# library(tune)
library(dials)
library(yardstick)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
## 
##     discard
library(tidyr)
library(dplyr)

## r session opts
options(scipen = 99L, mc.cores = 8L)
# rstan_options(auto_write = TRUE)
theme_set(theme_minimal())
## seed
seed <- 1234L
set.seed(seed)
## name of independent/outcome/target variable
outcome_col <- "SURGERY"
## indicator levels
ind_lvls <- c("no", "yes")
## event level/label
event_lvl <- "second"
event_lbl <- ind_lvls[2L]
## unique id column
uid_col <- "MEMBER_ID"
## default join spec
join_mbr <- join_by(MEMBER_ID)

## color palette
# cpal <- ggsci::pal_igv(palette = "default", alpha = 1)(51L)

bin_bin_metrics <- metric_set(mcc, f_meas, j_index, bal_accuracy, kap)

Import Data

## walk through data files and load into r env
dir_data() |>
  fs::dir_ls() |>
  purrr::walk(\(x) {
    ## env var name to assign data frame to
    env_obj_nm <- fs::path_file(x) |> str_remove_all(stringr::fixed(".txt"))
    env_obj_nm <- paste("df", tolower(env_obj_nm), sep = "_")
    ## import data
    df <- vroom::vroom(x, delim = " ", show_col_types = FALSE)
    ## check for parsing warnings/issues
    if (nrow(vroom::problems(df)) > 0L) {
      cli::cli_warn("parsing issues found with {.file x}, inspect manually")
    } else {
      ## glimpse data structure for non-test data
      if (!endsWith(env_obj_nm, "test")) {
        cli::cli_h3("{env_obj_nm}")
        glimpse(df, 80L)
      }
      assign(env_obj_nm, df, pos = .GlobalEnv)
    }
  })
## 
## ── df_claims_train
## Rows: 91,786
## Columns: 3
## $ MEMBER_ID <chr> "V2045033839", "U3368542783", "V7794221118", "L9612031535", …
## $ CLAIM_ID  <chr> "DWD-1644TI185", "JAV-0750KB107", "KNZ-3862GG349", "XPI-7191…
## $ LOS_CODE  <chr> "02", "11", "11", "22", "11", "11", "20", "20", "02", "20", …
## Multiple files in zip: reading '[Content_Types].xml'
## 
## ── df_dxccsr-reference-file-v2023-1.xlsx
## Rows: 5
## Columns: 4
## $ `<?xml`              <chr> "<Types", "ContentType=\"application/vnd.openxmlf…
## $ `version="1.0"`      <chr> "xmlns=\"http://schemas.openxmlformats.org/packag…
## $ `encoding="UTF-8"`   <chr> "Extension=\"bin\"", "ContentType=\"application/x…
## $ `standalone="yes"?>` <chr> "ContentType=\"application/vnd.openxmlformats-off…
## 
## ── df_location_of_service
## Rows: 7
## Columns: 2
## $ LOS_DESCR <chr> "Emergency Room", "Outpatient", "Inpatient", "Professional",…
## $ LOS_CODE  <chr> "23", "22", "21", "11", "20", "34", "02"
## 
## ── df_surgeries_test_scored.csv
## Rows: 10,491
## Columns: 1
## $ `MEMBER_ID,PREDICTION,predicted_class` <chr> "C4766121375,1.5577000500949957…
## 
## ── df_surgeries_train
## Rows: 41,965
## Columns: 9
## $ SURGERY             <chr> "no", "no", "no", "no", "no", "no", "no", "no", "n…
## $ MEMBER_ID           <chr> "W2648842706", "H4887816845", "N3902548962", "P318…
## $ ICD10               <chr> "M25611", "Z6854", "M79604", "M71122", "M25549", "…
## $ WEIGHTED_RISK_SCORE <dbl> 5.477114, 4.583526, 5.387169, 5.368089, 4.177909, …
## $ DIABETES            <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ OPIOID_RX           <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ CANCER              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AGE                 <dbl> 25.07722, 33.45827, 37.78651, 18.19988, 49.48217, …
## $ GENDER              <chr> "F", "F", "M", "F", "F", "M", "M", "M", "M", "F", …
## calculate outcome proportion
df_outcome_summ <- df_surgeries_train |> 
  summarise(n = n(), .by = all_of(outcome_col)) |>
  mutate(prop = n / sum(n))

## positive event/no info rate
outcome_pos_prop <- df_outcome_summ |> filter(SURGERY == "yes") |> pull(prop)
# glue("Positive Event Rate: {percent(outcome_pos_prop, .01)}")
no_info_rt <- outcome_pos_prop
if (no_info_rt < .5) no_info_rt <- 1 - no_info_rt
df_outcome_summ |>
  # rename_with()
  reactable::reactable(columns = list(
    SURGERY = reactable::colDef("Surgery?"),
    n = reactable::colDef("Count", format = reactable::colFormat(separators = T)),
    prop = reactable::colDef("Proportion", format = reactable::colFormat(percent = T, digits = 2L))
  ))

Analysis

Dx Codes

df_icd_summ_base <- df_surgeries_train |>
  summarise(
    value = n(),
    .by = c(SURGERY, ICD10)
    ) |>
  pivot_wider(names_from = SURGERY, values_fill = 0L) |>
  rowwise() |>
  mutate(total = sum(c_across(!ICD10))) |>
  ungroup() |>
  arrange(total, yes) |>
  mutate(
    prop = total / sum(total),
    cume_prop = cumsum(total) / sum(total),
    cume_pos_prop = cumsum(yes) / sum(yes)
    ) |>
  arrange(desc(cume_prop))

df_icd_summ_base |>
  mutate(idx = row_number()) |>
  select(idx, starts_with("cume_")) |>
  pivot_longer(cols = !idx) |>
  ggplot(aes(idx, value, col = name)) +
    geom_line() +
    geom_vline(xintercept = 89, col = "coral2", lty = 2L) +
    scale_y_continuous(labels = label_percent()) +
    scale_x_continuous(labels = label_comma()) +
    labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)

df_icd_summ_base |> slice_head(n = 89L)
## # A tibble: 89 × 7
##    ICD10     no   yes total    prop cume_prop cume_pos_prop
##    <chr>  <int> <int> <int>   <dbl>     <dbl>         <dbl>
##  1 M79642   249     8   257 0.00612     1             1    
##  2 M79632   250     4   254 0.00605     0.994         0.991
##  3 M25621   249     4   253 0.00603     0.988         0.986
##  4 M25529   238     9   247 0.00589     0.982         0.982
##  5 M545     241     4   245 0.00584     0.976         0.972
##  6 M79676   237     6   243 0.00579     0.970         0.967
##  7 M79621   238     4   242 0.00577     0.964         0.960
##  8 M25662   240     2   242 0.00577     0.959         0.956
##  9 M25551   231    10   241 0.00574     0.953         0.954
## 10 M25531   235     5   240 0.00572     0.947         0.942
## # ℹ 79 more rows
# df_icd_summ_base |> slice_tail(n = -89L)
## Dx codes by number of characters
df_surgeries_train |>
  mutate(len = nchar(ICD10)) |>
  summarise(
    n = n(),
    n_distinct = n_distinct(ICD10),
    .by = len
    )
## # A tibble: 4 × 3
##     len     n n_distinct
##   <int> <int>      <int>
## 1     6 23852        367
## 2     5 10624        387
## 3     4  7215        270
## 4     3   274         12
## trim DX codes to first n characters
## common first 5 characters summary
df_icd_5_summ_base <- df_surgeries_train |> 
  mutate(ICD = str_sub(ICD10, 1L, 5L)) |>
  summarise_dx()

df_icd_5_summ_base |>
  mutate(idx = row_number()) |>
  select(idx, starts_with("cume_")) |>
  pivot_longer(cols = !idx) |>
  ggplot(aes(idx, value, col = name)) +
    geom_line() +
    geom_vline(xintercept = 22, col = "coral2", lty = 2L) +
    geom_hline(yintercept = .6, col = "coral2", lty = 2L) +
    coord_cartesian(xlim = c(0L, 90L)) +
    scale_y_continuous(labels = label_percent()) +
    scale_x_continuous(labels = label_comma()) +
    labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)

## common first 4 characters summary (excluding common 5 character codes)
df_icd_4_summ_base <- df_surgeries_train |>
  mutate(ICD = str_sub(ICD10, 1L, 4L)) |>
  summarise_dx()
## plot common 4 character codes
df_icd_4_summ_base |>
  mutate(idx = row_number()) |>
  select(idx, starts_with("cume_")) |>
  pivot_longer(cols = !idx) |>
  ggplot(aes(idx, value, col = name)) +
    geom_line() +
    geom_vline(xintercept = 4, col = "coral2", lty = 2L) +
    geom_hline(yintercept = .6, col = "coral2", lty = 2L) +
    coord_cartesian(xlim = c(0L, 45L)) +
    scale_y_continuous(labels = label_percent()) +
    scale_x_continuous(labels = label_comma()) +
    labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)

## most common 4 character codes
icd_4_v <- df_icd_4_summ_base |> slice_head(n = 4L) |> pull(ICD)

## common first 3 characters summary (excluding common 4 & 5 character codes)
df_icd_3_summ_base <- df_surgeries_train |>
  mutate(
    ICD_4 = str_sub(ICD10, 1L, 4L),
    ICD_4_IND = case_match(ICD_4, icd_4_v ~ "yes", .default = "no"),
    ICD = case_match(ICD_4_IND, "no" ~ str_sub(ICD10, 1L, 3L), .default = NA_character_)
    ) |>
  filter(!is.na(ICD)) |>
  summarise_dx()
## plot common 3 character codes
df_icd_3_summ_base |>
  mutate(idx = row_number()) |>
  select(idx, starts_with("cume_")) |>
  pivot_longer(cols = !idx) |>
  ggplot(aes(idx, value, col = name)) +
    geom_line() +
    coord_cartesian(xlim = c(0L, 45L)) +
    scale_y_continuous(labels = label_percent()) +
    scale_x_continuous(labels = label_comma()) +
    labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)

## most common 3 character codes
icd_3_v <- df_icd_3_summ_base |> slice_head(n = 7L) |> pull(ICD)

## common first 2 characters summary (excluding common 3, 4, & 5 character codes)
df_icd_2_summ_base <- df_surgeries_train |>
  mutate(
    ICD_4 = str_sub(ICD10, 1L, 4L),
    ICD_4_IND = case_match(ICD_4, icd_4_v ~ TRUE, .default = FALSE),
    ICD_3 = str_sub(ICD10, 1L, 3L),
    ICD_3_IND = case_match(ICD_3, icd_3_v ~ TRUE, .default = FALSE),
    ICD_TRIM_IND = case_when(!ICD_3_IND & !ICD_4_IND ~ "no", .default = "yes"),
    ICD = case_match(ICD_TRIM_IND, "no" ~ str_sub(ICD10, 1L, 2L), .default = NA_character_)
    ) |>
  filter(!is.na(ICD)) |>
  summarise_dx()
## plot common 2 character codes
df_icd_2_summ_base |>
  mutate(idx = row_number()) |>
  select(idx, starts_with("cume_")) |>
  pivot_longer(cols = !idx) |>
  ggplot(aes(idx, value, col = name)) +
    geom_line() +
    scale_y_continuous(labels = label_percent()) +
    scale_x_continuous(labels = label_comma()) +
    labs(title = "Cumulative % of Dx", x = "Dx Count", y = NULL)

## most common 2 character codes
icd_2_v <- df_icd_2_summ_base |> slice_head(n = 12L) |> pull(ICD)

## add common Dx code indicators
df_icd_trim <- df_surgeries_train |> select(all_of(c(uid_col, outcome_col)), ICD10)
for (i in sort(c(icd_4_v, icd_3_v, icd_2_v))) {
  df_icd_trim <- df_icd_trim |> mutate(
    "{i}" := case_match(str_sub(ICD10, 1L, nchar(i)), i ~ "yes", .default = "no")
    )
}
df_icd_trim <- df_icd_trim |> select(!ICD10)

## calculate outcome association
icd_metrics <- function(x) {
  # x = "M256"
  dat <- df_icd_trim |>
    rename_with(\(y) "estimate", all_of(x)) |>
    select(all_of(c(uid_col, outcome_col)), estimate) |>
    distinct() |>
    mutate(across(c(all_of(outcome_col), estimate), \(x) factor(x, ind_lvls)))
  
  cm <- conf_mat(dat, truth = SURGERY, estimate = estimate)
  cramers_v <- confintr::ci_cramersv(cm$table)
  cramers_v <- c(cramers_v$estimate, cramers_v$interval) |>
    vctrs::vec_set_names(c("cramers_v", "cramers_v_lb", "cramers_v_ub"))
  odds_ratio <- confintr::ci_oddsratio(cm$table)
  odds_ratio <- log(c(odds_ratio$estimate, odds_ratio$interval)) |>
    vctrs::vec_set_names(c("odds_ratio", "odds_ratio_lb", "odds_ratio_ub"))
  out_1 <- bind_cols(t(cramers_v), t(odds_ratio))
  out_2 <- dat |>
    bin_bin_metrics(truth = SURGERY, estimate = estimate, event_level = event_lvl) |>
    select(!.estimator) |>
    pivot_wider(names_from = .metric, values_from = .estimate)
  bind_cols(out_1, out_2)
}
df_icd_metrics_res <- tibble(ICD = c(icd_4_v, icd_3_v, icd_2_v)) |>
  mutate(data = map(ICD, icd_metrics)) |>
  unnest(data)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `data = map(ICD, icd_metrics)`.
## Caused by warning in `ci_chisq_ncp()`:
## ! Upper limit outside search range. Set to the maximum of the parameter range.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
df_icd_metrics_res |> arrange(desc(cramers_v))
## # A tibble: 23 × 12
##    ICD   cramers_v cramers_v_lb cramers_v_ub odds_ratio odds_ratio_lb
##    <chr>     <dbl>        <dbl>        <dbl>      <dbl>         <dbl>
##  1 M54     0.00878            0       0.0190     -0.417        -0.933
##  2 M26     0.00627            0       0.0166      0.201        -0.130
##  3 Z6      0.00558            0       0.0159     -0.229        -0.664
##  4 M256    0.00552            0       0.0159     -0.119        -0.334
##  5 M6      0.00505            0       0.0154      0.237        -0.267
##  6 R2      0.00447            0       0.0149      0.209        -0.294
##  7 K7      0.00428            0       0.0147     -0.247        -0.886
##  8 F11     0.00417            0       0.0146      0.179        -0.277
##  9 F1      0.00395            0       0.0144      0.113        -0.179
## 10 M70     0.00331            0       0.0137     -0.127        -0.530
## # ℹ 13 more rows
## # ℹ 6 more variables: odds_ratio_ub <dbl>, mcc <dbl>, f_meas <dbl>,
## #   j_index <dbl>, bal_accuracy <dbl>, kap <dbl>
df_icd_metrics_res |>
  select(ICD, starts_with("odds_ratio")) |>
  mutate(
    ICD = factor(ICD, ordered = TRUE),
    ICD = reorder(ICD, desc(abs(odds_ratio)))
    ) |>
  ggplot(aes(ICD)) +
    geom_point(aes(y = odds_ratio)) +
    geom_errorbar(aes(ymin = odds_ratio_lb, ymax = odds_ratio_ub)) +
    scale_y_continuous(labels = label_percent()) +
    # scale_x_continuous(labels = label_comma()) +
    theme(axis.text.x = element_text(angle = 90L, hjust = 1L, vjust = .5))

    labs(title = "Log Odds Ratio", x = "Dx Count", y = NULL)
## $x
## [1] "Dx Count"
## 
## $y
## NULL
## 
## $title
## [1] "Log Odds Ratio"
## 
## attr(,"class")
## [1] "labels"
## select top Dx codes
icds_v <- df_icd_metrics_res |> slice_max(abs(odds_ratio), n = 10L) |> pull(ICD)
df_icd_trim <- df_icd_trim |> select(all_of(c(uid_col, sort(icds_v)))) 

Find most predictive Dx codes based onf

df_icd_3_base <- df_surgeries_train |>
  mutate(estimate = str_sub(ICD10, 1L, 3L)) |>
  select(all_of(c(uid_col, outcome_col)), estimate) |>
  mutate(across(all_of(outcome_col), \(x) factor(x, ind_lvls)))
icd_3_metrics <- function(x) {
  dat <- df_icd_3_base |>
    mutate(estimate = factor(
      case_match(estimate, x ~ "yes", .default = "no"),
      ind_lvls
      ))
  cm <- conf_mat(dat, truth = SURGERY, estimate = estimate)
  cramers_v <- confintr::ci_cramersv(cm$table)
  cramers_v <- c(cramers_v$estimate, cramers_v$interval) |>
    vctrs::vec_set_names(c("cramers_v", "cramers_v_lb", "cramers_v_ub"))
  odds_ratio <- confintr::ci_oddsratio(cm$table)
  odds_ratio <- log(c(odds_ratio$estimate, odds_ratio$interval)) |>
    vctrs::vec_set_names(c("odds_ratio", "odds_ratio_lb", "odds_ratio_ub"))
  out_1 <- bind_cols(t(cramers_v), t(odds_ratio))
  out_2 <- dat |>
    bin_bin_metrics(truth = SURGERY, estimate = estimate, event_level = event_lvl) |>
    select(!.estimator) |>
    pivot_wider(names_from = .metric, values_from = .estimate)
  bind_cols(out_1, out_2)
}
df_icd_3_metric_res <- df_icd_3_base |>
  summarise(n = n(), .by = estimate) |>
  mutate(prop = n / sum(n)) |>
  filter((n / sum(n)) > .01) |>
  select(icd_3 = estimate) |>
  mutate(data = map(icd_3, icd_3_metrics)) |>
  unnest(data) |>
  arrange(desc(abs(odds_ratio)))
icd_3_top_v <- df_icd_3_metric_res |>
  slice_head(n = 11L) |>
  pull(icd_3)

Quick Summary

summary(df_surgeries_train)
##    SURGERY           MEMBER_ID            ICD10           WEIGHTED_RISK_SCORE
##  Length:41965       Length:41965       Length:41965       Min.   : 1.112     
##  Class :character   Class :character   Class :character   1st Qu.: 4.548     
##  Mode  :character   Mode  :character   Mode  :character   Median : 5.229     
##                                                           Mean   : 5.244     
##                                                           3rd Qu.: 5.932     
##                                                           Max.   :10.810     
##     DIABETES        OPIOID_RX           CANCER              AGE       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :13.28  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:26.70  
##  Median :0.0000   Median :0.00000   Median :0.000000   Median :35.03  
##  Mean   :0.0921   Mean   :0.05438   Mean   :0.002359   Mean   :35.14  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:43.21  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.000000   Max.   :59.21  
##     GENDER         
##  Length:41965      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
df_surgeries_train |>
  summarise(across(everything(), \(x) mean(is.na(x)))) |>
  pivot_longer(everything()) |>
  arrange(desc(value), name)
## # A tibble: 9 × 2
##   name                value
##   <chr>               <dbl>
## 1 AGE                     0
## 2 CANCER                  0
## 3 DIABETES                0
## 4 GENDER                  0
## 5 ICD10                   0
## 6 MEMBER_ID               0
## 7 OPIOID_RX               0
## 8 SURGERY                 0
## 9 WEIGHTED_RISK_SCORE     0

No NULL’s?

df_mbr_clm_cts <- process_clms(df_claims_train)
feat_clm_type_col <- setdiff(colnames(df_mbr_clm_cts), uid_col)
feat_col <- colnames(df_surgeries_train) |>
  setdiff(c(uid_col, outcome_col, "ICD10", "GENDER")) |>
  c(icd_3_top_v, "MALE_IND", feat_clm_type_col) |>
  c(feat_clm_type_col) 
bin_col <- c("OPIOID_RX", "CANCER", "DIABETES", "MALE_IND", icd_3_top_v)
# multi_col <- c("ICD")
cont_col <- feat_col |> setdiff(bin_col)
feat_form <- formula(paste(outcome_col, paste(feat_col, collapse = " + "), sep = " ~ "))

df_train_base <- df_surgeries_train |>
  left_join(df_mbr_clm_cts, by = join_mbr) |>
  left_join(process_icds(df_surgeries_train), by = join_mbr) |>
  mutate(
    MALE_IND = case_match(GENDER, "M" ~ "yes", .default = "no"),
    CANCER = case_match(CANCER, 1L ~ "yes", .default = "no"),
    DIABETES = case_match(DIABETES, 1L ~ "yes", .default = "no"),
    OPIOID_RX = case_match(OPIOID_RX, 1L ~ "yes", .default = "no"),
    across(all_of(feat_clm_type_col), \(x) coalesce(x, 0)),
    across(all_of(c(outcome_col, bin_col)), \(x) factor(x, ind_lvls))
    ) |>
  select(all_of(c(uid_col, outcome_col, feat_col)))

## check for nulls
df_train_base |>
  summarise(across(all_of(feat_col), \(x) mean(is.na(x)))) |>
  pivot_longer(everything()) |>
  arrange(desc(value), name)
## # A tibble: 23 × 2
##    name           value
##    <chr>          <dbl>
##  1 AGE                0
##  2 CANCER             0
##  3 DIABETES           0
##  4 EMERGENCY_ROOM     0
##  5 F10                0
##  6 F11                0
##  7 G56                0
##  8 INPATIENT          0
##  9 K85                0
## 10 M25                0
## # ℹ 13 more rows
map(bin_col, \(x) {
  df_train_base |>
    rename_with(\(y) "var", all_of(x)) |>
    summarise(n = n(), .by = c(var, SURGERY)) |>
    mutate(prop = n / sum(n), .by = SURGERY) |>
    ggplot(aes(SURGERY, prop, fill = var)) +
      geom_col(position = "dodge", alpha = .7, show.legend = FALSE) + 
      facet_grid(rows = vars(var), scales = "free_y") +
      scale_y_continuous(labels = label_percent()) +
      labs(title = x, y = "Proportion by Surgery Indicator")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

map(cont_col, \(x) {
  df_train_base |>
    rename_with(\(y) "var", all_of(x)) |>
    ggplot(aes(var, fill = SURGERY)) +
      geom_histogram(bins = 30L, alpha = .7, show.legend = FALSE) +
      facet_grid(rows = vars(SURGERY), scales = "free_y") +
      scale_y_continuous(labels = label_comma(), trans = "log1p") +
      labs(title = glue::glue("{x} by {outcome_col}"), x = NULL)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## continuous feature intercorrelation
df_train_base |>
  select(all_of(cont_col)) |>
  corrr::correlate() |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'

## binary feature intercorrelation -- matthew's correlation coefficient
df_train_base |>
  select(all_of(c(outcome_col, bin_col))) |>
  corrr::colpair_map(.f = mcc_vec) |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()

Process Data

df_mbr_clm_cts <- process_clms(df_claims_train)
feat_clm_type_col <- setdiff(colnames(df_mbr_clm_cts), uid_col)
excl_col <- c(
  "TELEHEALTH", "CANCER", "DIABETES", "GENDER", "URGENT_CARE", "AGE",
  "PROFESSIONAL", "G56", "K85", "F11", "M25", "F10", "M71", "M60" ,"Z62", "M26"
  )
feat_col <- colnames(df_surgeries_train) |>
  setdiff(c(uid_col, outcome_col, "ICD10")) |>
  # c(icds_v) |>
  c(icd_3_top_v) |>
  c(feat_clm_type_col) |>
  setdiff(excl_col)
bin_col <- c("OPIOID_RX", icd_3_top_v) |> setdiff(excl_col)
# multi_col <- c("ICD")
cont_col <- feat_col |> setdiff(bin_col)
feat_form <- formula(paste(outcome_col, paste(feat_col, collapse = " + "), sep = " ~ "))

df_train_base <- process_predict(df_surgeries_train, df_claims_train)
df_train_base |>
  summarise(across(all_of(feat_col), \(x) mean(is.na(x)))) |>
  pivot_longer(everything()) |>
  arrange(desc(value), name)
## # A tibble: 7 × 2
##   name                value
##   <chr>               <dbl>
## 1 EMERGENCY_ROOM          0
## 2 INPATIENT               0
## 3 M54                     0
## 4 M70                     0
## 5 OPIOID_RX               0
## 6 OUTPATIENT              0
## 7 WEIGHTED_RISK_SCORE     0
map(bin_col, \(x) {
  df_train_base |>
    rename_with(\(y) "var", all_of(x)) |>
    summarise(n = n(), .by = c(var, SURGERY)) |>
    mutate(prop = n / sum(n), .by = SURGERY) |>
    ggplot(aes(SURGERY, prop, fill = var)) +
      geom_col(position = "dodge", alpha = .7, show.legend = FALSE) + 
      facet_grid(rows = vars(var), scales = "free_y") +
      scale_y_continuous(labels = label_percent()) +
      labs(title = x, y = "Proportion by Surgery Indicator")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

map(cont_col, \(x) {
  df_train_base |>
    rename_with(\(y) "var", all_of(x)) |>
    ggplot(aes(var, fill = SURGERY)) +
      geom_histogram(bins = 30L, alpha = .7, show.legend = FALSE) +
      facet_grid(rows = vars(SURGERY), scales = "free_y") +
      scale_y_continuous(labels = label_comma()) +
      labs(title = glue::glue("{x} by {outcome_col}"), x = NULL)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## continuous feature intercorrelation
df_train_base |>
  select(all_of(cont_col)) |>
  corrr::correlate(quiet = TRUE) |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()

## binary feature intercorrelation -- matthew's correlation coefficient
df_train_base |>
  select(all_of(c(outcome_col, bin_col))) |>
  corrr::colpair_map(.f = mcc_vec) |>
  corrr::rearrange(method = "HC") |>
  corrr::autoplot()

Split Data

Split data into train/validation and define resampling object

## training/validation split
tv_split <- df_train_base |> rsample::initial_split(prop = .7)
## resamples object (cross-validation)
df_folds <- training(tv_split) |> vfold_cv(v = 4L, repeats = 4L)

Model Specs

Logistic Regression Specs

## logistic regression recipe
log_recipe <- recipe(formula = feat_form, data = training(tv_split)) |>
  step_downsample(under_ratio = 2L) |>
  step_mutate_at(all_of(bin_col), fn = \(x) {
    case_match(x, "yes" ~ 1L, .default = 0L)
  }) |>
  # step_dummy(OPIOID_RX) |>
  # step_rename(OPIOID_RX = OPIOID_RX_yes) |>
  # step_interact(terms = ~ OUTPATIENT:OPIOID_RX) |>
  # step_rm(OPIOID_RX) |>
  step_log(all_of(cont_col), offset = 1) |>
  step_zv(all_predictors()) |> 
  step_normalize(all_numeric_predictors()) 

## logistic regression model spec
# library(multilevelmod)
t_prior <- rstanarm::student_t(df = 7, location = 0, scale = 2.5)
log_spec <- logistic_reg(mode = "classification") |>
  set_engine(
    engine = "stan",
    family = binomial(link = "logit"),
    prior = t_prior,
    prior_intercept = t_prior,
    seed = seed,
    cores = 8L
    )
# log_spec <- logistic_reg(
#   mode = "classification",
#   engine = "glmnet",
#   penalty = tune(),
#   mixture = 0
#   )

## logistic regression workflow
log_wf <- workflow() |> 
  add_recipe(log_recipe) |> 
  add_model(log_spec)

# ## tree parameter ranges
# log_param <- extract_parameter_set_dials(log_wf) |>
#   update(
#     penalty = penalty(c(-9, -3))
#     # mixture = mixture(c(0, 1))
#     )
# # tree parameter grid
# log_grid <- grid_max_entropy(log_param, size = 4L, variogram_range = 2 / 3)

Decision Tree Specs

## tree recipe
tree_recipe <- recipe(formula = feat_form, data = training(tv_split)) |>
  step_downsample(under_ratio = 2L)

## tree model spec
tree_spec <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
  ) |>
  set_mode("classification") |>
  set_engine(engine = "rpart")

## tree workflow
tree_wf <- workflow() |> 
  add_recipe(tree_recipe) |> 
  add_model(tree_spec) 

## tree parameter ranges
tree_param <- extract_parameter_set_dials(tree_wf) |>
  update(
    cost_complexity = cost_complexity(c(-10, -2)),
    tree_depth = tree_depth(c(4L, 8L)),
    min_n = min_n(c(30L, 120L))
    )

## tree parameter grid
tree_grid <- grid_max_entropy(tree_param, size = 9L, variogram_range = 2 / 3)

Random Forest

## random forest model spec
rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 300L) |>
  set_mode("classification") |>
  set_engine(
    engine = "ranger",
    max.depth   = 10L, #tune("max.depth"),
    # regularization.factor = tune("regularization_factor"),
    # regularization.usedepth = TRUE,
    num.threads = 8L
    )

## random forest workflow
rf_wf <- workflow() |> add_recipe(tree_recipe) |> add_model(rf_spec)

## random forest parameter ranges
rf_param <- extract_parameter_set_dials(rf_wf) |>
  update(
    mtry = mtry(c(2L, 4L)),
    min_n = min_n(c(20L, 50L))
    # trees = trees(c(300L, 600L))
    # max.depth = tree_depth(c(8L, 12L))
    # regularization_factor = regularization_factor(c(0, 1))
    )

## random forest parameter grid
rf_grid <- grid_max_entropy(rf_param, size = 8L, variogram_range = 2 / 3)
model_wf_set <- as_workflow_set(
  log = log_wf,
  tree = tree_wf,
  rf = rf_wf
  )

Training

Tuning

## tune grid control object
ctrl_tune_grid <- control_resamples(verbose = FALSE, event_level = event_lvl)
ctrl_tune_search <- control_sim_anneal(
  verbose = FALSE, no_improve = 8L, restart = 8L, event_level = event_lvl,
  save_workflow = T
  )
## evaluation metrics
eval_metrics <- metric_set(gain_capture, roc_auc, average_precision)

## random tune grid
set.seed(seed)
tune_grid_res <- model_wf_set |>
  option_add(id = "tree", param_info = tree_param, grid = tree_grid) |>
  option_add(id = "rf", param_info = rf_param, grid = rf_grid) |>
  workflow_map(
    fn = "tune_grid",
    seed = seed,
    verbose = FALSE,
    resamples = df_folds,
    metrics = eval_metrics,
    control = ctrl_tune_grid
    )
## tune grid results
autoplot(tune_grid_res, metric = "roc_auc")

tune_search_res <- tune_grid_res |>
  option_remove(grid) |>
  # model_wf_set |>
  # filter(wflow_id == "tree") |>
  filter(wflow_id != "log") |>
  # option_add(id = "tree", initial = extract_workflow_set_result(tune_grid_res, id = "tree")) |>
  # option_add(id = "rf", initial = extract_workflow_set_result(tune_grid_res, id = "rf")) |>
  workflow_map(
    fn = "tune_sim_anneal",
    verbose = FALSE,
    seed = seed,
    # resamples = df_folds,
    iter = 8L,
    # metrics = eval_metrics,
    control = ctrl_tune_search
    )
## Warning: There are existing options that are being modified
##  tree: 'control'
##  rf: 'control'
## Optimizing gain_capture
## Initial best: 0.70065
## 1 ◯ accept suboptimal  gain_capture=0.70065 (+/-0.01733)
## 2 ♥ new best           gain_capture=0.70335 (+/-0.01457)
## 3 ─ discard suboptimal gain_capture=0.6742 (+/-0.008733)
## 4 ♥ new best           gain_capture=0.80484 (+/-0.01746)
## 5 ─ discard suboptimal gain_capture=0.72608 (+/-0.01125)
## 6 ◯ accept suboptimal  gain_capture=0.76546 (+/-0.01878)
## 7 + better suboptimal  gain_capture=0.79854 (+/-0.0176)
## 8 ♥ new best           gain_capture=0.81571 (+/-0.01542)
## Optimizing gain_capture
## Initial best: 0.96057
## 1 ♥ new best           gain_capture=0.96284 (+/-0.001395)
## 2 ◯ accept suboptimal  gain_capture=0.96188 (+/-0.001268)
## 3 + better suboptimal  gain_capture=0.9624 (+/-0.001374)
## 4 ◯ accept suboptimal  gain_capture=0.96233 (+/-0.001378)
## 5 + better suboptimal  gain_capture=0.96274 (+/-0.001379)
## 6 ◯ accept suboptimal  gain_capture=0.96051 (+/-0.001507)
## 7 + better suboptimal  gain_capture=0.96245 (+/-0.001388)
## 8 ◯ accept suboptimal  gain_capture=0.96102 (+/-0.001482)
# tune_search_res <- structure(
#   tune_search_res |> bind_rows(tune_grid_res |> filter(wflow_id != "tree")),
#   class = c("workflow_set", class(tune_search_res))
#   )

## tune grid results
autoplot(tune_search_res, metric = c("roc_auc", "average_precision"))

# autoplot(tune_grid_res, id = "log", metric = "roc_auc")
autoplot(tune_search_res, id = "tree", metric = "roc_auc")

autoplot(tune_search_res, id = "rf", metric = "roc_auc")

Last Fit

## logistic
## stan
log_fit <- log_wf |> fit(data = training(tv_split))
## glm/glmnet
# log_fit_param <- tune_grid_res |>
#   extract_workflow_set_result(id = "log") |>
#   select_by_pct_loss(
#     desc(penalty), mixture,
#     metric = "gain_capture",
#     limit = 2
#     )

## decision tree
tree_fit_param <- tune_search_res |>
  extract_workflow_set_result(id = "tree") |>
  select_by_pct_loss(
    tree_depth, desc(min_n), desc(cost_complexity), 
    metric = "gain_capture",
    limit = 2
    )
tree_fit <- tree_wf |>
  finalize_workflow(tree_fit_param) |>
  fit(data = training(tv_split))

## random forest
rf_fit_param <- tune_search_res |>
  extract_workflow_set_result(id = "rf") |> #collect_metrics()
  select_by_pct_loss(
    mtry, desc(min_n), #trees,
    metric = "gain_capture",
    limit = 2
    )
rf_fit <- rf_wf |>
  finalize_workflow(rf_fit_param) |>
  fit(data = training(tv_split))
rf_imp_fit <- rf_wf |>
  update_model(
    spec = rand_forest(mtry = tune(), min_n = tune(), trees = 300L) |>
      set_mode("classification") |>
      set_engine(
        engine = "ranger",
        importance = "impurity_corrected",
        max.depth   = 10L, #tune("max.depth"),
        num.threads = 8L
        )
    )|>
  finalize_workflow(rf_fit_param) |>
  fit(data = training(tv_split))

## model fit list
fit_ls <- list(log = log_fit, tree = tree_fit, rf = rf_fit)

Model Evaluation

Feature Importance

log_stan_fit <- log_fit |> extract_fit_engine()

bayesplot::color_scheme_set("viridis")
# log_stan_fit |> plot()
log_stan_fit |> plot(plotfun = "areas", prob = 0.9)

# log_stan_fit |> plot(plotfun = "combo")
## logistic
df_log_imp <- broom.mixed::tidyMCMC(log_stan_fit) |>
  filter(term != "(Intercept)") |>
  mutate(
    Variable = term,
    Importance = 100 * abs(estimate) / sum(abs(estimate)),
    .keep = "none"
    )

## tree
df_tree_imp <- tree_fit |> extract_fit_parsnip() |> vip::vi(scale = TRUE)

## random forest
df_rf_imp <- rf_imp_fit |> extract_fit_parsnip() |> vip::vi(scale = TRUE)

## plot
bind_rows(
  df_log_imp |> mutate(wflow_id = "log"),
  df_tree_imp |> mutate(wflow_id = "tree"),
  df_rf_imp |> mutate(wflow_id = "rf")
  ) |>
  order_var_fct() |>
  ggplot(aes(Importance, Variable)) +
    geom_col() +
    facet_grid(cols = vars(wflow_id)) +
    labs(title = "Feature Importance", y = NULL)

Decision Tree Plot

extract_fit_engine(tree_fit) |>
  rpart.plot::prp(
    type = 2, varlen = 0, faclen = 0, extra = 106, roundint = FALSE, digits = 3
    )

Density Plots

df_to_score <- training(tv_split) |>
  mutate(data_set = "train") |>
  bind_rows(testing(tv_split)) |>
  mutate(
    data_set = coalesce(data_set, "vald"),
    across(all_of(outcome_col), \(x) factor(x, ind_lvls))
    )

df_eval <- imap(fit_ls, \(x, y) {
  df_to_score |>
    mutate(
      estimate = predict(x, pick(all_of(feat_col)), type = "prob")$.pred_yes,
      wflow_id = {{ y }}
      ) |>
  select(all_of(c(uid_col, outcome_col)), estimate, data_set, wflow_id)
  }) |>
  list_rbind()
# df_eval <- df_to_score |>
#   mutate(
#     estimate = predict(rf_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes,
#     ) |>
#   select(all_of(c(uid_col, outcome_col, estimate, wflow_id)))

df_eval |>
  group_by(data_set, wflow_id) |>
  ggplot(aes(estimate, fill = SURGERY)) +
    geom_density(alpha = .7) +
    facet_grid(rows = vars(SURGERY), cols = vars(wflow_id), scales = "free_y") +
    scale_x_continuous(breaks = seq(0, .75, .25), labels = label_percent(), trans = "sqrt") +
    scale_fill_viridis_d() +
    theme(
      legend.position = "top", legend.justification = "right",
      legend.margin = margin(t = -13, r = 5, b = -7)
      ) +
    labs(title = "Predicted Probability Density")

Metrics

Table

## model metric table
df_eval |>
  group_by(data_set, wflow_id) |>
  eval_metrics(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
  select(!.estimator) |>
  mutate(.estimate = round(.estimate, 3)) |>
  pivot_wider(names_from = data_set, values_from = .estimate) |>
  mutate(
    .metric = str_replace_all(.metric, "_", " "),
    spread = vald - train
    ) |>
  reactable::reactable(
    columns = list(
      .metric = reactable::colDef("Metric", minWidth = 125L),
      wflow_id = reactable::colDef("Model", maxWidth = 75L),
      train = reactable::colDef("Train", format = reactable::colFormat(percent = T, digits = 2L)),
      vald = reactable::colDef("Validation", format = reactable::colFormat(percent = T, digits = 2L)),
      spread = reactable::colDef("Validation - Train", format = reactable::colFormat(percent = T, digits = 2L))
      ),
    groupBy = ".metric",
    highlight = T, #outlined = T
    bordered = T
    )

Plots

## roc auc curve
df_eval |>
  group_by(data_set, wflow_id) |>
  roc_curve(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
  # autoplot()
  filter(!is.infinite(.threshold)) |>
  ggplot(aes(1 - specificity, sensitivity, color = wflow_id)) +
    geom_line(aes(lty = data_set)) +
    geom_abline(slope = 1, intercept = 0, lty = 3L) +
    scale_x_continuous(labels = label_percent(), expand = c(0,0)) +
    scale_y_continuous(labels = label_percent(), expand = c(0,0)) +
    labs(title = "ROC AUC", lty = "Dataset", color = "Model")

## lift curve
df_eval |>
  group_by(data_set, wflow_id) |>
  lift_curve(truth = all_of(outcome_col), estimate, event_level = event_lvl) |>
  # autoplot()
  filter(!is.infinite(.lift)) |>
  ggplot(aes(.percent_tested, .lift, color = wflow_id)) +
    geom_line(aes(lty = data_set)) +
    scale_x_continuous(labels = label_percent(scale = 1), expand = c(0,0)) +
    scale_y_continuous(labels = label_math(~ .x * x), expand = c(0,0)) +
    labs(title = "Lift", x = "% Tested", y = "Lift", lty = "Dataset", color = "Model")
## Warning: Removed 6 rows containing missing values (`geom_line()`).

Predicted Probability Threshold

Choose threshold based on the J-Index using the weighted average of the training/validation set’s.

  • training weight: 1L
  • validation weight: 2L
## score with logistic model
df_log_eval <- df_to_score |>
  mutate(
    estimate = predict(log_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes
    ) |>
  select(all_of(c(uid_col, outcome_col)), data_set, estimate)

## feature distributions by predicted probability decile
df_log_eval |>
  select(all_of(uid_col), estimate) |>
  bind_cols(bake(prep(log_recipe), new_data = df_to_score)) |>
  mutate(across(where(is.factor), \(x) case_match(x, "yes" ~ 1L, .default = 0L))) |>
  pivot_longer(cols = !c(all_of(c(uid_col, outcome_col)), estimate)) |>
  mutate(decile = factor(ntile(estimate, n = 10L), seq_len(10L), ordered = TRUE)) |>
  ggplot(aes(decile, value, fill = decile)) +
    geom_violin(alpha = .8, show.legend = FALSE) +
    facet_wrap(~ name, scales = "free_y")

## performance by predicted probability threshold
prob_thresholds <- seq(.0025, .1, .0025)
df_thresh <- df_log_eval |>
  group_by(data_set) |>
  probably::threshold_perf(
    truth = all_of(outcome_col),
    estimate = estimate,
    thresholds = prob_thresholds,
    event_level = event_lvl
    )

df_thresh |>
  ggplot(aes(.threshold, .estimate, col = data_set)) +
    geom_line() +
    facet_grid(rows = vars(.metric), scale = "free_y") +
    scale_x_continuous(
      breaks = c(.005, .01, .025, .05, .1), labels = label_percent(),
      trans = "sqrt", expand = c(0,0)
      ) +
    scale_y_continuous(labels = label_percent(), trans = "log1p") +
    theme(
      legend.position = "top", legend.justification = "right",
      legend.margin = margin(t = -13, r = 5, b = -7)
      ) +
    labs(title = "Predicted Probability Threshold Evaluation", col = "Data Set")

## select optimal threshold
event_thresh <- df_thresh |>
  filter(.metric == "j_index") |>
  mutate(wt = case_match(data_set, "vald" ~ 2L, .default = 1L)) |>
  summarise(.estimate = weighted.mean(.estimate, wt), .by = .threshold) |>
  slice_max(.estimate)
event_thresh |> glue::glue_data(
  "Predicted Probability Threshold for a Positive Event: {percent(.threshold, .01)}"
)
## Predicted Probability Threshold for a Positive Event: 4.25%

Score Test

## test set predictions
df_test_base <- process_predict(df_surgeries_test, df_claims_test, dataset = "test") |>
  mutate(
    PREDICTION = predict(log_fit, pick(all_of(feat_col)), type = "prob")$.pred_yes,
    predicted_class = factor(
      case_when(PREDICTION > event_thresh$.estimate ~ "yes", .default = "no"),
      ind_lvls
      )
    ) |>
  select(all_of(uid_col), PREDICTION, predicted_class)

## class prediction summary table
df_test_base |>
  summarise(n = n(), .by = predicted_class) |>
  mutate(prop = round(n / sum(n), 4))
## # A tibble: 2 × 3
##   predicted_class     n   prop
##   <fct>           <int>  <dbl>
## 1 no              10390 0.990 
## 2 yes               101 0.0096
## predicted probability distribution
df_test_base |>
  ggplot(aes(PREDICTION)) +
  geom_density(fill = "coral2", alpha = .8) +
  scale_x_continuous(labels = label_percent(), trans = "sqrt")

df_test_base |>
  readr::write_csv(dir_data("Surgeries_Test_Scored.csv"))
## Registered S3 methods overwritten by 'readr':
##   method                    from 
##   as.data.frame.spec_tbl_df vroom
##   as_tibble.spec_tbl_df     vroom
##   format.col_spec           vroom
##   print.col_spec            vroom
##   print.collector           vroom
##   print.date_names          vroom
##   print.locale              vroom
##   str.col_spec              vroom

I think the surgery that we are attempting to predict is shoulder surgery based on the most common Dx codes. If a 2nd/3rd guesses count for anything then I’d go with knee surgery as the next most likely followed by the dark horse of dental surgery, although I think the Dx codes related to dental surgery may have been .